Getting started

We first need to load the packages we’ll use and set up a theme for our plots. The main workhorse for our networks will be tidygraph and ggraph. As this is a basic primer, you can find more info on these packages here:

library(tidyverse) # For data manipulation/graphing
library(tidygraph) # Represent network data in tabular format
library(cowplot) # Make ggplot prettier
library(ggraph) # Graph networks

theme_set(theme_cowplot(12)) # Minimalist theme

# Colorblind friendly palette
cbPalette <- c("#999999", #grey
               "#E69F00", #orange
               "#56B4E9", #sky blue
               "#009E73", #bluish green 
               "#F0E442", #yellow
               "#0072B2", #blue
               "#D55E00", #vermillion
               "#CC79A7") #reddish purple

Matrices

Most network data can be represented as a matrix. Take for example the following data pertaining to expression of Vibrio cholerae virulence genes across 300 samples. Say we want to visualize correlation between genes/across samples. We will start wit raw data with normalized reads per transcript across experiments

# Raw data with normalized reads per transcript across experiments
virulence_tbl <- read_csv("data/VPI1_VPI2.csv") %>%
  filter(!(Transcript%in%c(paste0("VC",c(1785:1900)))))
head(virulence_tbl)
## # A tibble: 6 x 301
##   Transcript  Tn_1  Tn_2  Tn_3 TntfoX.strep_1 TntfoX.strep_2 TntfoX.strep_3
##   <chr>      <dbl> <dbl> <dbl>          <dbl>          <dbl>          <dbl>
## 1 VC0070      8.88  9.36  8.76           8.90           8.9            8.69
## 2 VC0583     15.6  15.2  15.2           15.3           14.9           15.1 
## 3 VC0817      8.86  9.61  8.99           8.99           9.16           8.67
## 4 VC0818      7.50  8.08  7.38           7.38           7.33           7.29
## 5 VC0819      7.52  7.85  7.47           7.64           7.61           7.31
## 6 VC0820      7.55  7.90  7.73           7.79           7.80           7.62
## # … with 294 more variables: TntfoY.strep_1 <dbl>, TntfoY.strep_2 <dbl>,
## #   TnqstR_1 <dbl>, TnqstR_2 <dbl>, FRT.Tn_1 <dbl>, FRT.TntfoX.strep_1 <dbl>,
## #   FRT.TntfoX.strep_2 <dbl>, FRT.TntfoX.strep_3 <dbl>,
## #   FRT.TntfoY.strep_1 <dbl>, FRT.TntfoY.strep_2 <dbl>,
## #   FRT.TntfoY.strep_3 <dbl>, FRT.TnqstR_1 <dbl>, FRT.TnqstR_2 <dbl>,
## #   FRT.TnqstR_3 <dbl>, TntfoY.strep_3 <dbl>, FRT.Tn_2 <dbl>, TnqstR_3 <dbl>,
## #   FRT.Tn_3 <dbl>, delta.qstR_TntfoX_1 <dbl>, delta.qstR_TntfoX_2 <dbl>,
## #   delta.hapR._TntfoX_1 <dbl>, delta.hapR._TntfoX_2 <dbl>,
## #   delta.qstR_TntfoX_3 <dbl>, delta.hapR._TntfoX_3 <dbl>, WT_TntfoX_1 <dbl>,
## #   WT_TntfoX_2 <dbl>, WT_TntfoX_3 <dbl>, WT_TntfoX_NA_1 <dbl>,
## #   WT_TntfoX_NA_2 <dbl>, WT_TntfoX_NA_3 <dbl>, WT_no_T_1 <dbl>,
## #   WT_no_T_2 <dbl>, WT_no_T_3 <dbl>, TntfoY_1 <dbl>, TntfoY_2 <dbl>,
## #   TntfoY_3 <dbl>, Empty_ctl_1 <dbl>, Empty_ctl_2 <dbl>, Empty_ctl_3 <dbl>,
## #   vqmr_1 <dbl>, vqmr_2 <dbl>, vqmr_3 <dbl>, water_1 <dbl>, water_2 <dbl>,
## #   water_3 <dbl>, AI_1 <dbl>, AI_2 <dbl>, AI_3 <dbl>, CAI_1 <dbl>,
## #   CAI_2 <dbl>, CAI_3 <dbl>, DPO_1 <dbl>, DPO_2 <dbl>, DPO_3 <dbl>,
## #   AI_CAI_1 <dbl>, AI_CAI_2 <dbl>, AI_DPO_1 <dbl>, AI_DPO_2 <dbl>,
## #   AI_CAI_3 <dbl>, AI_DPO_3 <dbl>, CAI_DPO_1 <dbl>, CAI_DPO_2 <dbl>,
## #   CAI_DPO_3 <dbl>, AI_CAI_DPO_1 <dbl>, AI_CAI_DPO_2 <dbl>,
## #   AI_CAI_DPO_3 <dbl>, Ctl_1 <dbl>, Ctl_2 <dbl>, vchM_KO_1 <dbl>,
## #   vchM_KO_2 <dbl>, QS.mutant_1 <dbl>, QS.mutant_TEX_1 <dbl>,
## #   QS.mutant_2 <dbl>, QS.mutant_TEX_2 <dbl>, QS.mutant_3 <dbl>,
## #   QS.mutant_TEX_3 <dbl>, QS.mutant_4 <dbl>, QS.mutant_TEX_4 <dbl>,
## #   WT_C6706_1 <dbl>, WT_C6706_TEX_1 <dbl>, WT_C6706_2 <dbl>,
## #   WT_C6706_TEX_2 <dbl>, WT_C6706_3 <dbl>, WT_C6706_TEX_3 <dbl>,
## #   WT_C6706_4 <dbl>, WT_C6706_TEX_4 <dbl>, ToxT_KO_1 <dbl>, ToxT_KO_2 <dbl>,
## #   ToxT_KO_3 <dbl>, ToxT_Rescue_1 <dbl>, ToxT_Rescue_2 <dbl>,
## #   ToxT_Rescue_3 <dbl>, dncV_KO_1 <dbl>, dncV_KO_2 <dbl>, dncV_KO_3 <dbl>,
## #   dncV_Rescue_1 <dbl>, dncV_Rescue_2 <dbl>, dncV_Rescue_3 <dbl>,
## #   lacZ_1 <dbl>, lacZ_hns_1 <dbl>, …

Next we need to convert this into a matrix with only numbers

vir_matrix <- as.matrix(virulence_tbl %>% column_to_rownames("Transcript"))
vir_matrix[1:10,1:10]
##          Tn_1   Tn_2   Tn_3 TntfoX.strep_1 TntfoX.strep_2 TntfoX.strep_3
## VC0070  8.878  9.365  8.756          8.895          8.900          8.686
## VC0583 15.573 15.231 15.178         15.297         14.887         15.092
## VC0817  8.856  9.612  8.988          8.991          9.158          8.667
## VC0818  7.499  8.083  7.375          7.381          7.334          7.286
## VC0819  7.518  7.847  7.471          7.642          7.609          7.312
## VC0820  7.551  7.898  7.726          7.789          7.796          7.625
## VC0821  6.950  7.160  7.060          7.067          6.787          6.951
## VC0822  8.444  8.727  8.546          8.735          8.647          8.445
## VC0823  8.814  9.230  8.799          9.271          9.105          9.014
## VC0824  8.554  8.412  8.516          8.813          8.675          8.918
##        TntfoY.strep_1 TntfoY.strep_2 TnqstR_1 TnqstR_2
## VC0070          8.624          8.811    8.807    9.150
## VC0583         15.196         14.897   15.415   15.234
## VC0817          8.564          9.001    8.797    9.389
## VC0818          7.323          7.268    7.333    7.637
## VC0819          7.425          7.568    7.347    7.906
## VC0820          7.546          7.801    7.561    8.029
## VC0821          6.620          6.832    6.854    7.142
## VC0822          8.967          9.059    8.572    8.837
## VC0823          9.187          9.444    8.666    9.291
## VC0824          9.069          8.785    8.540    8.689

Our correlation function, cor(), calculates correlations between columns so we need to rotate our data using the t() (transpose) function.

vir_matrix <- t(vir_matrix)
vir_matrix[1:10,1:10]
##                VC0070 VC0583 VC0817 VC0818 VC0819 VC0820 VC0821 VC0822 VC0823
## Tn_1            8.878 15.573  8.856  7.499  7.518  7.551  6.950  8.444  8.814
## Tn_2            9.365 15.231  9.612  8.083  7.847  7.898  7.160  8.727  9.230
## Tn_3            8.756 15.178  8.988  7.375  7.471  7.726  7.060  8.546  8.799
## TntfoX.strep_1  8.895 15.297  8.991  7.381  7.642  7.789  7.067  8.735  9.271
## TntfoX.strep_2  8.900 14.887  9.158  7.334  7.609  7.796  6.787  8.647  9.105
## TntfoX.strep_3  8.686 15.092  8.667  7.286  7.312  7.625  6.951  8.445  9.014
## TntfoY.strep_1  8.624 15.196  8.564  7.323  7.425  7.546  6.620  8.967  9.187
## TntfoY.strep_2  8.811 14.897  9.001  7.268  7.568  7.801  6.832  9.059  9.444
## TnqstR_1        8.807 15.415  8.797  7.333  7.347  7.561  6.854  8.572  8.666
## TnqstR_2        9.150 15.234  9.389  7.637  7.906  8.029  7.142  8.837  9.291
##                VC0824
## Tn_1            8.554
## Tn_2            8.412
## Tn_3            8.516
## TntfoX.strep_1  8.813
## TntfoX.strep_2  8.675
## TntfoX.strep_3  8.918
## TntfoY.strep_1  9.069
## TntfoY.strep_2  8.785
## TnqstR_1        8.540
## TnqstR_2        8.689

Now we run cor and figure out and get a beautiful correlation matrix

corr_matrix <- cor(vir_matrix)
corr_matrix[1:10,1:10]
##             VC0070      VC0583    VC0817     VC0818      VC0819     VC0820
## VC0070  1.00000000  0.16195291 0.2923376 0.30848546 -0.05990451 -0.1419211
## VC0583  0.16195291  1.00000000 0.4698289 0.05497381 -0.09315693 -0.2112110
## VC0817  0.29233762  0.46982891 1.0000000 0.72052321  0.47087945  0.3783796
## VC0818  0.30848546  0.05497381 0.7205232 1.00000000  0.64973191  0.6223768
## VC0819 -0.05990451 -0.09315693 0.4708795 0.64973191  1.00000000  0.9508117
## VC0820 -0.14192109 -0.21121095 0.3783796 0.62237685  0.95081168  1.0000000
## VC0821 -0.18789479 -0.23536283 0.2802374 0.53283252  0.90239355  0.9491916
## VC0822 -0.15024823 -0.19473395 0.3724130 0.58755926  0.90157928  0.9592002
## VC0823 -0.15392651 -0.03353641 0.4831163 0.66704341  0.81914945  0.8889545
## VC0824 -0.15376359 -0.10678858 0.1112739 0.19691159  0.67498754  0.7047740
##            VC0821     VC0822      VC0823     VC0824
## VC0070 -0.1878948 -0.1502482 -0.15392651 -0.1537636
## VC0583 -0.2353628 -0.1947339 -0.03353641 -0.1067886
## VC0817  0.2802374  0.3724130  0.48311632  0.1112739
## VC0818  0.5328325  0.5875593  0.66704341  0.1969116
## VC0819  0.9023936  0.9015793  0.81914945  0.6749875
## VC0820  0.9491916  0.9592002  0.88895446  0.7047740
## VC0821  1.0000000  0.9637688  0.89100152  0.7748621
## VC0822  0.9637688  1.0000000  0.92578074  0.7781170
## VC0823  0.8910015  0.9257807  1.00000000  0.6371455
## VC0824  0.7748621  0.7781170  0.63714551  1.0000000

After converting this matrix into a tidy format we can visualize our results in a heatmap using geom_tile

long_tbl <- corr_matrix %>%
  as_tibble(rownames = NA) %>%
  rownames_to_column(var = "Transcript1") %>%
  pivot_longer(cols = -Transcript1, names_to = "Transcript2",
               values_to = "Correlation") %>%
  mutate(Transcript1 = as.character(Transcript1))
ggplot(long_tbl, aes(x = Transcript1,
                     y = Transcript2,
                     fill = Correlation)) +
  
  geom_tile() +
  scale_x_discrete(breaks = c(virulence_tbl$Transcript))+
  scale_fill_gradient2(low = "red",
                       high = "blue",
                       limits = c(-1,1)) +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1))

Man is this busy/ugly! This data would look much better if we could somehow filter to show only interesting connections and avoid redundant info. How do we do that?

Tidygraph and ggraph basics

We can make a network using tidygraph and ggraph! Tidygraph is an extension of existing table manipulation tidyverse packages (e.g. tidyr, dplyr) that can represent network/graph type data. Ggraph extends ggplot’s grammar/syntax style to visualizing network/graph data.

Let’s start by converting our previous correlation matrix into a tbl_graph object

vir_graph <- as_tbl_graph(corr_matrix)
vir_graph
## # A tbl_graph: 61 nodes and 3721 edges
## #
## # A directed multigraph with 1 component
## #
## # Node Data: 61 x 1 (active)
##   name  
##   <chr> 
## 1 VC0070
## 2 VC0583
## 3 VC0817
## 4 VC0818
## 5 VC0819
## 6 VC0820
## # … with 55 more rows
## #
## # Edge Data: 3,721 x 3
##    from    to weight
##   <int> <int>  <dbl>
## 1     1     1  1    
## 2     1     2  0.162
## 3     1     3  0.292
## # … with 3,718 more rows

As you can see, our tbl_graph is really just two tibbles now, one containing all of our node information and the other containing all the edge data. You can manipulate these tibbles using standard tidyverse functions (e.g. left_join, mutate, filter, select). We will give some examples shortly but let’s first see what this data looks like when we plot it using ggraph. Ggraph is based off of the syntax of ggplot and every network plot will require a ggraph call to specify data you want to graph as well as calls to a node and edge geom.

ggraph(vir_graph) + # ggraph call
  geom_node_point() + # node geom
  geom_edge_link() # edge geom

That took a while and is exceptionally ugly! Let’s apply a correlation cutoff. We’re only interested in genes with a correlation of > 0.5 or < -0.1 so let’s filter and replot that. To specify which tibble we want to filter on (either edges or nodes) we use the activate function

vir_graph_filtered <- vir_graph %>%
  activate(edges) %>%
  filter(weight >= 0.5 | weight < -0.1)

ggraph(vir_graph_filtered) +
  geom_node_point() +
  geom_edge_link()

Looking better, now let’s tweak things a bit so that we know what genes we’re using and the edges aren’t so overwhelming. Most ggraph geoms are analogous to existing ggplot geoms and take similar parameters. Here, we use an alpha value to control how transparent the edges are and use geom_node_text to label our nodes.

ggraph(vir_graph_filtered) +
  geom_edge_link(alpha = 0.1) +
  geom_node_point() +
  geom_node_text(aes(label = name)) # Note that you can use the repel=TRUE option force labels to avoid overlapping

Into the weeds

Coloring edges and nodes

We just went over the basics for tidygraph and ggraph, now let’s dig into the weeds a bit. Here we want to add some color to our graph so that we can quickly figure out some info about our nodes and edges. Let’s color nodes by location on the genome (VPI1, VPI2, or other) and color edges by the correlation between genes

vpi2 <- paste0("VC", c(1757:1810))
vpi1 <- paste0("VC0", c(817:847))

vir_graph_groups <- 
  vir_graph_filtered %>%
  activate(nodes) %>%
  mutate(group = 
           case_when(name %in% vpi1 ~ "VPI1",
                     name %in% vpi2 ~ "VPI2",
                     TRUE ~ "Other"))
  

ggraph(vir_graph_groups) +
  geom_edge_link(alpha = 0.25, aes(colour = weight)) +
  scale_edge_color_gradient2(low = "red",
                             mid = "grey50",
                       high = "blue",
                       limits = c(-1,1)) + 
  geom_node_point(aes(colour = group)) +
  scale_colour_manual(values = cbPalette)

The edge colors were a little faint and the nodes are a little small. call we improve this? Let’s use binary coloring for the edges (positive/negative instead of the numerical gradient) and increase the size of the nodes.

vir_graph_colours <- 
  vir_graph_groups %>%
  activate(edges) %>%
  mutate(corr_dir= if_else(weight > 0, 
                             "positive",
                             "negative"))
ggraph(vir_graph_colours) +
  geom_edge_link(alpha = 0.1, aes(colour = corr_dir)) +
  scale_edge_color_manual(values = c("positive" = "blue",
                                     "negative" = "red"))+
  geom_node_point(aes(colour = group), size = 4) +
  scale_colour_manual(values = cbPalette)

Labels and shapes

Now let’s label all of the genes in the “Other” group and change the shape of the nodes based on their grouping.

vir_graph_shapes_labels <- 
  vir_graph_colours %>%
  activate(nodes) %>%
  mutate(name_label = if_else(group == "Other",
                              name,
                              NA_character_))
ggraph(vir_graph_shapes_labels) +
  geom_edge_link(alpha = 0.1, aes(colour = corr_dir)) +
  scale_edge_color_manual(values = c("positive" = "blue",
                                     "negative" = "red"))+
  geom_node_point(aes(colour = group,
                      shape = group), size = 4) +
  scale_colour_manual(values = cbPalette) +
  geom_node_text(aes(label = name_label))

Facetting

You can also facet based on nodes or edges. This can be great if you want to show how the same network changes over time/different conditions.

Node facetting

ggraph(vir_graph_shapes_labels) +
  geom_edge_link(alpha = 0.1, aes(colour = corr_dir)) +
  scale_edge_color_manual(values = c("positive" = "blue",
                                     "negative" = "red"))+
  geom_node_point(aes(colour = group,
                      shape = group), size = 4) +
  scale_colour_manual(values = cbPalette) +
  geom_node_text(aes(label = name_label)) +
  facet_nodes(~group)

Edge facetting

ggraph(vir_graph_shapes_labels) +
  geom_edge_link(alpha = 0.1, aes(colour = corr_dir)) +
  scale_edge_color_manual(values = c("positive" = "blue",
                                     "negative" = "red"))+
  geom_node_point(aes(colour = group,
                      shape = group), size = 4) +
  scale_colour_manual(values = cbPalette) +
  geom_node_text(aes(label = name_label)) +
  facet_edges(~corr_dir)

Network analysis

You may want to characterize certain aspects of your network such as node centrality (i.e. connectedness) or cluster your nodes based on an established algorithm/network topology. Tidygraph is loaded with a ton of convenient functions to do just that! Let’s run through a quick example that calculates the centrality of our nodes and clusters them.

vir_graph_cent_clust <- 
  vir_graph_shapes_labels %>%
  activate(nodes) %>%
  mutate(
    central = centrality_closeness(),
    infomap = as.factor(group_infomap()))

ggraph(vir_graph_cent_clust) +
  geom_edge_link(alpha = 0.1, aes(colour = corr_dir)) +
  scale_edge_color_manual(values = c("positive" = "blue",
                                     "negative" = "red"))+
  geom_node_point(aes(colour = infomap,
                      shape = group,
                      size = central)) +
  scale_colour_manual(values = cbPalette) +
  geom_node_text(aes(label = name_label))

Normal Layouts

You can manually choose any layout supported by igraph (see here for some ideas: https://www.data-imaginist.com/2017/ggraph-introduction-layouts/). These can drastically change the appearance of your final plot. Here’s an example of the sample data visualized with 12 different layouts (“stress” is the default we’ve been using thus far).

layout_test <- 
  function(layout_choice,
           graph_in){
  plot_out <- ggraph(graph_in, layout = layout_choice) +
  geom_edge_link(alpha = 0.1, aes(colour = corr_dir)) +
  scale_edge_color_manual(values = c("positive" = "blue",
                                     "negative" = "red"))+
  geom_node_point(aes(colour = group,
                      shape = group), size = 4) +
  scale_colour_manual(values = cbPalette) +
  ggtitle(layout_choice) +
  geom_node_text(aes(label = name_label))
  
  return(plot_out)

  }

layouts <- c('star', 'circle', 'gem', 'dh', 'graphopt', 'grid', 'mds', 
                    'randomly', 'fr', 'drl', 'lgl', 'stress')

vir_graph_simple <- vir_graph_shapes_labels %>%
  activate(nodes) %>%
  filter(group != "VPI2")

plot_list <- lapply(layouts,
                    layout_test,
                    graph_in = vir_graph_simple)

plot_grid(plotlist = plot_list,
          ncol = 3)

Special Layouts

You may also want to employ some more specialized layouts in combination with specific node or edge geoms. Ggraph provides a lot of functionality including:

Linear

ggraph(vir_graph_shapes_labels, layout = "linear") +
  geom_edge_arc(alpha = 0.1, aes(colour = corr_dir)) +
  scale_edge_color_manual(values = c("positive" = "blue",
                                     "negative" = "red"))+
  geom_node_point(aes(colour = group,
                      shape = group), size = 4) +
  scale_colour_manual(values = cbPalette)

Circular

ggraph(vir_graph_shapes_labels, layout = "linear", circular = TRUE) +
  geom_edge_arc(alpha = 0.1, aes(colour = corr_dir)) +
  scale_edge_color_manual(values = c("positive" = "blue",
                                     "negative" = "red"))+
  geom_node_point(aes(colour = group,
                      shape = group), size = 4) +
  scale_colour_manual(values = cbPalette)

Dendrogram/tree

This is just a fun example, there are much better packages to make these with, e.g. ggtree.

vir_dist <- as.dist(corr_matrix)
vir_clust <- hclust(vir_dist)

clust_graph <- as_tbl_graph(vir_clust) %>%
  activate(nodes) %>%
  mutate(group = 
           case_when(label %in% vpi1 ~ "VPI1",
                     label %in% vpi2 ~ "VPI2",
                     label == "" ~ NA_character_,
                     TRUE ~ "Other"))
ggraph(clust_graph, layout = "dendrogram") +
  geom_edge_diagonal2(aes(colour = node.group)) +
  # geom_node_point(, size = 3) +
  geom_node_text(aes(label = label), angle = 45, nudge_y = -0.4, nudge_x = -0.6)

More to consider

We’ve covered the basics but there are a lot of edge (and node!) cases that could come up. Here are some links and tips that may be helpful as you work on your own network visualizations: